Prepare files and variables so that we can iterate through all of our code automatically
Read in each treatment separately and merge into one object. Normalization can be performed on individual datasets or a combined dataset; however, there are ramifications for choosing to normalize as a group (ie. merging control and treatment) or normalizing each dataset separately.
data_a_raw <- Read10X(data.dir = "/Users/cmay/Desktop/rstudio/jenn_avm_scrna/378a/files/")
data_a <- CreateSeuratObject(counts = data_a_raw, min.cells = 3, min.features = 200, project = "378a", assay = "RNA")
data_b_raw <- Read10X(data.dir = "/Users/cmay/Desktop/rstudio/jenn_avm_scrna/378b/files/")
data_b <- CreateSeuratObject(counts = data_b_raw, min.cells = 3, min.features = 200, project = "378b", assay = "RNA")
data_c_raw <- Read10X(data.dir = "/Users/cmay/Desktop/rstudio/jenn_avm_scrna/410/files/")
data_c <- CreateSeuratObject(counts = data_c_raw, min.cells = 3, min.features = 200, project = "410", assay = "RNA")
data_d_raw <- Read10X(data.dir = "/Users/cmay/Desktop/rstudio/jenn_avm_scrna/control/files/")
data_d <- CreateSeuratObject(counts = data_d_raw, min.cells = 3, min.features = 200, project = "control", assay = "RNA")
avm_treatment <- merge(x = data_a, y = data_b, add.cell.ids = NULL, merge.data = TRUE)
avm_control <- merge(x = data_c, y = data_d, add.cell.ids = NULL, merge.data = TRUE)
avm <- merge(x = data_a, y = c(data_b, data_c, data_d), add.cell.ids = NULL, merge.data = TRUE)
RenameCells(object = avm, add.cell.id = "avm")
## An object of class Seurat
## 15244 features across 18601 samples within 1 assay
## Active assay: RNA (15244 features)
Confirm that your data was properly read in and slotted into the Seurat Object
# Check the metadata
head(avm@meta.data)
## orig.ident nCount_RNA nFeature_RNA
## AAACCTGAGATCCCGC_1 378a 6326 1594
## AAACCTGAGATGTAAC_1 378a 5990 1708
## AAACCTGAGCCCAACC_1 378a 5029 1567
## AAACCTGAGCCGCCTA_1 378a 2630 887
## AAACCTGCAATGGAAT_1 378a 5648 1828
## AAACCTGCACCGAAAG_1 378a 6100 1570
tail(avm@meta.data)
## orig.ident nCount_RNA nFeature_RNA
## TTTGTCAAGTTACGGG_4 control 4988 1381
## TTTGTCACACCAGGCT_4 control 3599 1148
## TTTGTCACATGTCGAT_4 control 7085 1745
## TTTGTCACATTAGGCT_4 control 3229 972
## TTTGTCACATTCACTT_4 control 3363 1160
## TTTGTCATCCCTCAGT_4 control 10615 2537
High mitochondrial RNA is indicative of damaged or dead cells, thus should be filtered
#Mitochondrial RNA
mito.genes1 <- grep(pattern = "^MT", x = rownames(avm@assays[["RNA"]]), value = TRUE) #^MT for human or ^mt- for mice
percent.mito1 <- Matrix::colSums(avm@assays[["RNA"]][mito.genes1])/Matrix::colSums(avm@assays[["RNA"]])
avm <- AddMetaData(object = avm, metadata = percent.mito1, col.name = "percent.mito1")
#Ribosomal RNA
ribo.genes1 <- grep(pattern = "^^RP[SL]", x = rownames(avm@assays[["RNA"]]), value = TRUE)
percent.ribo1 <- Matrix::colSums(avm@assays[["RNA"]][ribo.genes1])/Matrix::colSums(avm@assays[["RNA"]])
avm <- AddMetaData(object = avm, metadata = percent.ribo1, col.name = "percent.ribo1")
VlnPlot(object = avm, group.by="orig.ident", features = c("nFeature_RNA", "nCount_RNA", "percent.mito1","percent.ribo1"), ncol=2, pt.size=0) #Change the size of the beeswarm points
Determine if you should regress out cell cycle genes from the model. If you see clusters correlating with specific cell cycle phases (S, G1, G2M), you may want to regres out cell cycle genes. Caution is advised if your dataset includes progenitor or stem cells, as proliferation may be colinear with cell type identity. First plot the data after performin normal regression (which can include counts, percent mitochondria and percent ribosomes). Next, regress out cell cycle genes (CC.Difference) and check the plot for changes. S, G1 and G2M should be more randomly scattered in the UMAP plot.
# A list of cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
#Assign Cell Cycle Scores
avm <- CellCycleScoring(avm, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
#View cell cycle scores and phase assignments
head(avm[[]])
## orig.ident nCount_RNA nFeature_RNA percent.mito1
## AAACCTGAGATCCCGC_1 378a 6326 1594 0.09215934
## AAACCTGAGATGTAAC_1 378a 5990 1708 0.08647746
## AAACCTGAGCCCAACC_1 378a 5029 1567 0.10180950
## AAACCTGAGCCGCCTA_1 378a 2630 887 0.18441065
## AAACCTGCAATGGAAT_1 378a 5648 1828 0.11419972
## AAACCTGCACCGAAAG_1 378a 6100 1570 0.11278689
## percent.ribo1 S.Score G2M.Score Phase old.ident
## AAACCTGAGATCCCGC_1 0.3188429 -0.009238666 -0.1578514 G1 378a
## AAACCTGAGATGTAAC_1 0.2681135 0.512631879 -0.4926310 S 378a
## AAACCTGAGCCCAACC_1 0.2463710 -0.003849444 -0.1554622 G1 378a
## AAACCTGAGCCGCCTA_1 0.2760456 -0.063473054 -0.3277677 G1 378a
## AAACCTGCAATGGAAT_1 0.2478754 0.006643855 0.2086011 G2M 378a
## AAACCTGCACCGAAAG_1 0.3319672 -0.152095808 -0.7533732 G1 378a
avm <- FindVariableFeatures(object = avm, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5, nfeatures = 6000)
#Run PCA to see if cells segregate along cell cycle phase
avm <- ScaleData(object = avm, vars.to.regress = c("nCounts_RNA","percent.mito1")) #can also add "percent.ribo1"
avm <- RunPCA(avm, features = c(s.genes, g2m.genes))
Plot to check that no clusters are correlated with cell cycle phases
DimPlot(avm)
To correct for Cell Cycle Genes, use ScaleData to regress the difference between G2M and S phase scores
avm$CC.Difference <- avm$S.Score - avm$G2M.Score
avm <- ScaleData(avm, vars.to.regress = "CC.Difference", features = rownames(avm))
avm <- RunPCA(avm, features = VariableFeatures(avm), nfeatures.print = 10)
Re-plot to confirm no clusters are correlated with cell cycle phase
DimPlot(avm,reduction = "pca", group.by="orig.ident")
nCount_RNA is the total number of transcripts in the cell nFeature_RNA is the number of unique genes detected in the cell
plot1 <- FeatureScatter(object = avm, feature1 = "percent.mito1", feature2 = "nCount_RNA")
plot2 <- FeatureScatter(object = avm, feature1 = "percent.ribo1", feature2 = "nCount_RNA")
plot3 <- FeatureScatter(object = avm, feature1 = "G2M.Score", feature2 = "percent.ribo1")
plot4 <- FeatureScatter(object = avm, feature1 = "S.Score", feature2 = "percent.mito1")
CombinePlots(plots = list(plot1, plot2, plot3, plot4))
Filter out cells based on number of genes and percent mitochondrial genes Less than 200-500 genes indicate damaged cells. More than 5,000 genes could indicate multiplets.
avm <- subset(x = avm, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mito1 < 20)
avm <- subset(x = avm, subset = nCount_RNA > 600 & nCount_RNA < 20000)
Dimensional reduction plot, with cells colored by a quantitative feature
plot3 <- FeaturePlot(object = avm, features = "PECAM1")
plot4 <- FeaturePlot(object = avm, features = "MMP1")
CombinePlots(plots = list(plot3, plot4))
# Violin plot
VlnPlot(object = avm, features = c("CDH5", "PECAM1","MMP1"), same.y.lims = TRUE, group.by="orig.ident", pt.size=0)
DimHeatmap shows primary sources of heterogeneity in a avm and can be useful when trying to decide which PCs to include for further downstream analyses. For instance, if cell cycle genes are high here, it may indicate that you should regress out cell cycle genes.
DimHeatmap(object = avm, reduction = "pca", dims=2, cells = 100, nfeature=100, balanced = TRUE)
Seurat uses a graph-based clustering approach: K-nearest neighbor (KNN) graph, with edges drawn between cells with similar gene expression patterns, and then attempt to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’.
FindVariableGenes calculates the average expression and dispersion for each gene, places these genes into bins, and then calculates a z-score for dispersion within each bin. This helps control for the relationship between variability and average expression.
#These parameters will vary based on the data set, heterogeneity in the sample and normalization strategy.
avm <- FindVariableFeatures(object = avm, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5, nfeatures = 5000)
head(x = HVFInfo(object = avm))
## mean variance variance.standardized
## RP11-34P13.7 0.001697484 0.001694705 0.9756154
## AP006222.2 0.176902092 0.194846593 0.9594373
## RP11-206L10.9 0.004486208 0.004587609 0.9970272
## LINC00115 0.001818733 0.001815535 0.9754871
## FAM41C 0.011821764 0.011925231 0.9615060
## NOC2L 0.004849955 0.004826725 0.9690061
variable_genes <- (x = HVFInfo(object = avm))
write.csv(variable_genes, "/Users/cmay/Desktop/rstudio/avm_variable_genes.csv")
Calculate k-nearest neighbors and construct the SNN graph
## Normalize the Data
avm <- SCTransform(avm, vars.to.regress = "percent.ribo1", verbose = FALSE)
avm <- FindVariableFeatures(avm, mean.function = ExpMean,
dispersion.function = LogVMR, x.low.cutoff = 0.0125,
x.high.cutoff = 3, y.cutoff = 0.5, nfeatures = 5000)
avm <- RunPCA(avm,npcs = 50,rev.pca = FALSE,
weight.by.var = TRUE,ndims.print = 1:5,
nfeatures.print = 30,seed.use = 42,approx = TRUE)
Computing Nearest Neighbor Graph
avm <- FindNeighbors(avm,dims = 1:30, # Dimensions
k.param = 15, # K neighbors
nn.method = "annoy",
annoy.metric = "hamming") # This is the distance metric
avm <- FindClusters(avm, resolution = 0.6,
algorithm = 3,
dims.use=1:30,
n.start = 10,
n.iter = 100,
save.SNN=TRUE)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 16495
## Number of edges: 481661
##
## Running smart local moving algorithm...
## Maximum modularity in 10 random starts: 0.8353
## Number of communities: 10
## Elapsed time: 61 seconds
save(avm, file = "/Users/cmay/Desktop/rstudio/avm_clusters.csv")
avm <- RunUMAP(avm, reduction = "pca", dims = 1:30, # can add umap.method = "umap-learn" but don't need
metric = "hamming", n.neighbors = 20, min.dist = 0, # This allows for tighter clustering
n.epochs = 1000, # This repeats the algorithm 1000x to get most consistent result
local.connectivity = 3, # minimum 3 neighbors
verbose = TRUE)
avm_umap1 <- DimPlot(avm, group.by="Phase")
avm_umap2 <- DimPlot(avm, group.by="orig.ident")
avm_umap3 <- DimPlot(avm, group.by="SCT_snn_res.0.6")
CombinePlots(plots = list(avm_umap1, avm_umap2, avm_umap3,avm_umap3))
Seurat can help you find markers that define clusters via differential expression. FindAllMarkers automatically identifes positive and negative markers of a single cluster compared to all other cells. You can also test groups of clusters vs. each other, or against all cells.
# Find all markers of Cluster 1
cluster0.markers <- FindMarkers(object = avm, ident.1 = 0, min.pct = 0.25)
print(x = head(x = cluster0.markers, n = 20))
## p_val avg_logFC pct.1 pct.2 p_val_adj
## MMP1 0 1.3645712 0.987 0.541 0
## CYTL1 0 0.9384128 0.856 0.475 0
## HLA-B 0 0.8582702 0.909 0.485 0
## ADGRF5 0 0.7019012 0.694 0.262 0
## RPL13A 0 0.6983088 1.000 0.998 0
## TRIML2 0 0.6627174 0.585 0.094 0
## NEAT1 0 0.6561820 0.989 0.879 0
## RPL3 0 0.6293213 1.000 0.995 0
## SPARC 0 0.6263405 0.995 0.940 0
## NUAK1 0 0.6226915 0.634 0.197 0
## RPS19 0 0.6193133 0.946 0.653 0
## RPL27A 0 0.6085498 0.999 0.974 0
## TPT1 0 0.5733333 1.000 1.000 0
## SULF2 0 0.5654225 0.707 0.334 0
## ADAMTS18 0 0.5614392 0.722 0.406 0
## S100A6 0 0.5585444 1.000 0.999 0
## A2M 0 0.5508741 0.362 0.039 0
## PECAM1 0 0.4566564 0.983 0.879 0
## RPS21 0 0.4513789 0.898 0.661 0
## FILIP1 0 0.4479808 0.421 0.064 0
write.csv(cluster0.markers, "/Users/cmay/Desktop/rstudio/cluster0.markers.csv")
# Find markers for every cluster compared to all remaining cells, report only the positive ones
#avm.all.markers <- FindAllMarkers(object = avm, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25, test.use="MAST")
avm.all.markers <- FindAllMarkers(object = avm, test.use="MAST")
avm.all.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
## # A tibble: 20 x 7
## # Groups: cluster [10]
## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0. 1.36 0.987 0.541 0. 0 MMP1
## 2 0. 0.938 0.856 0.475 0. 0 CYTL1
## 3 0. 1.88 0.939 0.404 0. 1 TFPI2
## 4 0. 0.802 0.734 0.356 0. 1 SPOCK1
## 5 0. 1.12 0.973 0.568 0. 2 MMP1
## 6 0. 0.964 0.671 0.23 0. 2 MT2A
## 7 0. 0.878 1 0.96 0. 3 TUBA1B
## 8 0. 0.860 1 0.992 0. 3 HMGB1
## 9 0. 0.924 0.814 0.142 0. 4 XIST
## 10 0. 0.581 0.815 0.491 0. 4 EMCN
## 11 0. 1.62 0.87 0.459 0. 5 FABP4
## 12 0. 1.36 0.763 0.206 0. 5 ALDH1A1
## 13 6.33e-259 0.760 0.491 0.132 9.11e-255 6 IL1RL1
## 14 9.38e- 89 0.748 0.446 0.309 1.35e- 84 6 HIST1H4C
## 15 0. 0.973 0.851 0.145 0. 7 PDPN
## 16 0. 0.929 0.936 0.559 0. 7 MGST1
## 17 0. 1.91 0.9 0.407 0. 8 MGP
## 18 0. 1.59 0.882 0.469 0. 8 FABP4
## 19 0. 2.59 0.959 0.01 0. 9 COL1A2
## 20 0. 2.27 0.987 0.632 0. 9 FN1
write.csv(avm.all.markers, "/Users/cmay/Desktop/rstudio/avm.all.markers.csv")
Generates an expression heatmap for given cells and genes. Plot the top 10 markers for each cluster.
top10 <- avm.all.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
avm_heatmap <- DoHeatmap(object = avm, features = top10$gene, label = TRUE)
avm_heatmap
Visualizing marker expression by Treatment Group and by Clusters VlnPlot (shows expression probability distributions across clusters)
avm_violinplot_genes1a <- VlnPlot(object = avm,features=c("PECAM1","MGP","CDH5","SPARC","MMP1","ALDH1A1"), same.y.lims = TRUE, group.by="orig.ident", pt.size=0)
avm_violinplot_genes1b <- VlnPlot(object = avm,features=c("PECAM1","MGP","CDH5","SPARC","MMP1","ALDH1A1"), same.y.lims = TRUE, pt.size=0)
avm_violinplot_genes1a
avm_violinplot_genes1b
Visualizing marker expression by Treatment Group and by Clusters cont. VlnPlot (shows expression probability distributions across clusters)
avm_violinplot_genes2a <- VlnPlot(object = avm,features=c("TGFBI","FN1","CXCL8","GLIPR1","TFPI2","CCL2"), same.y.lims = TRUE, group.by="orig.ident", pt.size=0)
avm_violinplot_genes2b <- VlnPlot(object = avm,features=c("TGFBI","FN1","CXCL8","GLIPR1","TFPI2","CCL2"), same.y.lims = TRUE, pt.size=0)
avm_violinplot_genes2a
avm_violinplot_genes2b
FeaturePlot (visualizes gene expression on a tSNE or PCA plot)
avm_featureplot_genes <- FeaturePlot(object = avm, features = c("PECAM1","TGFBI","CDH5","ALDH1A1","SPARC","MMP1"), cols = c("grey", "blue"), reduction = "umap")
avm_featureplot_genes
avm_ridgeplot_genes <- RidgePlot(object = avm,features=c("PECAM1","TGFBI","CDH5","ALDH1A1","SPARC","MMP1"))
avm_ridgeplot_genes